In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw insurance records from the CSV file in the working directory.
insurance = pd.read_csv(filepath_or_buffer='insurance.csv')

In [11]:
# head() returns the first 5 rows by default (spelled out here);
# tail() is the counterpart that returns the last 5 rows.
insurance.head(n=5)

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges
0,15634602,19,female,27.9,0,yes,southwest,16884.924
1,15647311,18,male,33.77,1,no,southeast,1725.5523
2,15619304,28,male,33.0,3,no,southeast,4449.462
3,15701354,33,male,22.705,0,no,northwest,21984.47061
4,15737888,32,male,28.88,0,no,northwest,3866.8552


In [12]:
# info() prints a summary of the dataset (columns, non-null counts, dtypes);
# describe() is a related method that is used further down.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1340 non-null   int64  
 1   age       1336 non-null   object 
 2   sex       1340 non-null   object 
 3   bmi       1332 non-null   float64
 4   children  1340 non-null   int64  
 5   smoker    1340 non-null   object 
 6   region    1340 non-null   object 
 7   charges   1339 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 83.9+ KB


In [14]:
# This step relies on already knowing which junk marker the dataset uses:
# every cell equal to '?' is replaced with a missing value (NaN).
insurance.replace(to_replace=['?'], value=np.nan, inplace=True)

In [15]:
# Re-inspect non-null counts and dtypes after replacing '?' with NaN.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1340 non-null   int64  
 1   age       1336 non-null   object 
 2   sex       1340 non-null   object 
 3   bmi       1332 non-null   float64
 4   children  1340 non-null   int64  
 5   smoker    1340 non-null   object 
 6   region    1340 non-null   object 
 7   charges   1339 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 83.9+ KB


In [18]:
# Missing-value handling for the 'bmi' column.
# The strategies below are alternatives shown side by side. Once forward fill
# has run, the later ones only act on gaps it could not reach (e.g. a missing
# value in the very first row).
# Each result is assigned back to the column instead of calling `inplace=True`
# on a selected Series: that pattern may modify a temporary copy and stops
# updating the frame under pandas Copy-on-Write.
# 1. Forward fill ('bfill' is the backward-fill counterpart).
#    `fillna(method=...)` is deprecated, so the dedicated method is used.
insurance['bmi'] = insurance['bmi'].ffill()
# 2. Mean fill; filling with 0, another constant, the mode or the median
#    follows the same pattern.
insurance['bmi'] = insurance['bmi'].fillna(insurance['bmi'].mean())
# 3. Drop the rows that contain missing values.
# insurance.dropna(subset = ['bmi'], inplace = True)
# 4. Linear interpolation, restricted to the numeric 'bmi' column because
#    interpolating a frame that still holds object-dtype columns is deprecated.
insurance['bmi'] = insurance['bmi'].interpolate(method='linear')

  insurance.bmi.fillna(method = 'ffill', inplace = True)
  insurance.interpolate(method = 'linear', inplace = True)


In [19]:
# Confirm that 'bmi' no longer has missing values after the filling steps.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1340 non-null   int64  
 1   age       1336 non-null   object 
 2   sex       1340 non-null   object 
 3   bmi       1340 non-null   float64
 4   children  1340 non-null   int64  
 5   smoker    1340 non-null   object 
 6   region    1340 non-null   object 
 7   charges   1339 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 83.9+ KB


In [21]:
# Remove every row that still has a missing value in any column,
# then print the summary to verify the remaining row count.
insurance.dropna(axis='index', how='any', inplace=True)
insurance.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1336 entries, 0 to 1339
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1336 non-null   int64  
 1   age       1336 non-null   object 
 2   sex       1336 non-null   object 
 3   bmi       1336 non-null   float64
 4   children  1336 non-null   int64  
 5   smoker    1336 non-null   object 
 6   region    1336 non-null   object 
 7   charges   1336 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 93.9+ KB


In [24]:
# Duplicate handling.
# Duplicates are not necessarily errors: they can result from how the data was
# collected or from business logic, so think it over before deleting them.
# Here rows sharing the same 'id' are reduced to their first occurrence.
insurance.drop_duplicates(subset=['id'], keep='first', inplace=True)

In [25]:
# Check the row count after removing rows with duplicate 'id' values.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1334 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1334 non-null   int64  
 1   age       1334 non-null   object 
 2   sex       1334 non-null   object 
 3   bmi       1334 non-null   float64
 4   children  1334 non-null   int64  
 5   smoker    1334 non-null   object 
 6   region    1334 non-null   object 
 7   charges   1334 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 93.8+ KB


In [26]:
# Summary statistics. 'age' and 'charges' are still object dtype at this point,
# so they do not appear in the numeric summary.
insurance.describe()

Unnamed: 0,id,bmi,children
count,1334.0,1334.0,1334.0
mean,15690830.0,30.689059,1.096702
std,71924.57,6.097047,1.205926
min,15565700.0,15.96,0.0
25%,15628230.0,26.315,0.0
50%,15690040.0,30.4,1.0
75%,15750770.0,34.7525,2.0
max,15815360.0,53.13,5.0


In [27]:
# 'age' and 'charges' were loaded as object dtype; convert both to floats so
# they are treated as numeric columns.
insurance['age'] = insurance['age'].astype('float64')
insurance['charges'] = insurance['charges'].astype('float64')

In [28]:
# Summary statistics again, after 'age' and 'charges' were converted to float.
insurance.describe()

Unnamed: 0,id,age,bmi,children,charges
count,1334.0,1334.0,1334.0,1334.0,1334.0
mean,15690830.0,39.206897,30.689059,1.096702,13240.732052
std,71924.57,14.047313,6.097047,1.205926,12094.420611
min,15565700.0,18.0,15.96,0.0,1121.8739
25%,15628230.0,27.0,26.315,0.0,4724.369462
50%,15690040.0,39.0,30.4,1.0,9369.61575
75%,15750770.0,51.0,34.7525,2.0,16584.318157
max,15815360.0,64.0,53.13,5.0,63770.42801


In [29]:
# Min-max normalisation: (bmi - min) / (max - min), stored as 'bmi_norm'.
insurance['bmi_norm'] = (
    insurance['bmi']
    .sub(insurance['bmi'].min())
    .div(insurance['bmi'].max() - insurance['bmi'].min())
)
insurance.head()

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges,bmi_norm
0,15634602,19.0,female,27.9,0,yes,southwest,16884.924,0.321227
1,15647311,18.0,male,33.77,1,no,southeast,1725.5523,0.47915
2,15619304,28.0,male,33.0,3,no,southeast,4449.462,0.458434
3,15701354,33.0,male,22.705,0,no,northwest,21984.47061,0.181464
4,15737888,32.0,male,28.88,0,no,northwest,3866.8552,0.347592


In [35]:
# Z-score standardisation: (bmi - mean) / std, stored as 'bmi_standard'.
insurance['bmi_standard'] = (
    insurance['bmi']
    .sub(insurance['bmi'].mean())
    .div(insurance['bmi'].std())
)
insurance.head()

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges,bmi_norm,bmi_standard
0,15634602,19.0,female,27.9,0,yes,southwest,16884.924,0.321227,-0.457444
1,15647311,18.0,male,33.77,1,no,southeast,1725.5523,0.47915,0.505317
2,15619304,28.0,male,33.0,3,no,southeast,4449.462,0.458434,0.379026
3,15701354,33.0,male,22.705,0,no,northwest,21984.47061,0.181464,-1.309496
4,15737888,32.0,male,28.88,0,no,northwest,3866.8552,0.347592,-0.296711


In [37]:
# Bin edges used to group 'bmi' into intervals.
bins = [0, 22, 26, 30, 100]
# pd.cut returns a Series that gives, for each row, the interval its 'bmi'
# value falls into (see the 'bmi_counts' column in the output below).
insurance['bmi_counts'] = pd.cut(insurance['bmi'], bins=bins)
insurance.head()

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges,bmi_norm,bmi_standard,bmi_counts
0,15634602,19.0,female,27.9,0,yes,southwest,16884.924,0.321227,-0.457444,"(26, 30]"
1,15647311,18.0,male,33.77,1,no,southeast,1725.5523,0.47915,0.505317,"(30, 100]"
2,15619304,28.0,male,33.0,3,no,southeast,4449.462,0.458434,0.379026,"(30, 100]"
3,15701354,33.0,male,22.705,0,no,northwest,21984.47061,0.181464,-1.309496,"(22, 26]"
4,15737888,32.0,male,28.88,0,no,northwest,3866.8552,0.347592,-0.296711,"(26, 30]"


In [38]:
# Inspect the dtypes of the derived columns added above.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1334 entries, 0 to 1337
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            1334 non-null   int64   
 1   age           1334 non-null   float64 
 2   sex           1334 non-null   object  
 3   bmi           1334 non-null   float64 
 4   children      1334 non-null   int64   
 5   smoker        1334 non-null   object  
 6   region        1334 non-null   object  
 7   charges       1334 non-null   float64 
 8   bmi_norm      1334 non-null   float64 
 9   bmi_standard  1334 non-null   float64 
 10  bmi_counts    1334 non-null   category
dtypes: category(1), float64(5), int64(2), object(3)
memory usage: 116.2+ KB


In [39]:
from sklearn.preprocessing import LabelEncoder

In [40]:
# Sex: learn the distinct labels and replace them with integer codes.
le = LabelEncoder()
insurance['sex'] = le.fit_transform(insurance['sex'])

In [41]:
# Smoker flag: refit the shared encoder on this column and store the integer codes.
insurance['smoker'] = le.fit_transform(insurance['smoker'])

In [42]:
# Region: refit the shared encoder on this column and store the integer codes.
insurance['region'] = le.fit_transform(insurance['region'])

In [43]:
# BMI group: convert the interval categories to their string labels first,
# then refit the shared encoder and store the integer codes.
insurance['bmi_counts'] = insurance['bmi_counts'].astype(str)
insurance['bmi_counts'] = le.fit_transform(insurance['bmi_counts'])

In [45]:
# Preview the first 20 rows of the fully encoded frame.
insurance.head(n=20)

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges,bmi_norm,bmi_standard,bmi_counts
0,15634602,19.0,0,27.9,0,1,3,16884.924,0.321227,-0.457444,2
1,15647311,18.0,1,33.77,1,0,2,1725.5523,0.47915,0.505317,3
2,15619304,28.0,1,33.0,3,0,2,4449.462,0.458434,0.379026,3
3,15701354,33.0,1,22.705,0,0,1,21984.47061,0.181464,-1.309496,1
4,15737888,32.0,1,28.88,0,0,1,3866.8552,0.347592,-0.296711,2
5,15574012,31.0,0,25.74,0,0,2,3756.6216,0.263115,-0.811714,1
6,15592531,46.0,0,33.44,1,0,2,8240.5896,0.470272,0.451192,3
7,15656148,37.0,0,27.74,3,0,1,7281.5056,0.316922,-0.483687,2
8,15792365,37.0,1,29.83,2,0,0,6406.4107,0.37315,-0.140898,2
9,15592389,60.0,0,25.84,0,0,1,28923.13692,0.265806,-0.795313,1
