# 数据预处理
## 创建数据集

In [1]:
import os
# Make sure the ../data directory exists, then write a tiny house-price CSV
# in which missing entries are spelled "NA".
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')
csv_rows = [
    'NumRooms,Alley,Price\n',  # header row
    'NA,Pave,127500\n',  # every following row is one data sample
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
]
with open(data_file, 'w') as csv_out:
    csv_out.writelines(csv_rows)

## 读取数据集

In [2]:
import pandas as pd

# Load the CSV written above into a DataFrame and print it; the "NA"
# entries of the file appear as NaN in the printed frame.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


## 处理缺失值

In [3]:
# Split the frame by position: the first two columns are the features,
# the third column is the target.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]

# Fill missing values with the per-column mean. Only numeric columns have a
# mean, so missing entries in non-numeric columns are left as they are.
column_means = inputs.mean(numeric_only=True)
inputs = inputs.fillna(column_means)
print(inputs)

   NumRooms Alley
0       3.0  Pave
1       2.0   NaN
2       4.0   NaN
3       3.0   NaN


In [4]:
# One-hot encode the categorical columns. dummy_na=True adds a separate
# indicator column for missing values, so NaN becomes its own category.
inputs = pd.get_dummies(data=inputs, dummy_na=True)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0        True      False
1       2.0       False       True
2       4.0       False       True
3       3.0       False       True


## 转换为张量格式

In [5]:
import torch
# Convert the features and the target to float NumPy arrays first, then
# wrap each array in a tensor.
features_np = inputs.to_numpy(dtype=float)
targets_np = outputs.to_numpy(dtype=float)
X = torch.tensor(features_np)
y = torch.tensor(targets_np)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))

## 练习
### 第一题

In [6]:
# Exercise 1: build a second toy dataset with more rows and more missing
# entries, then load it back with pandas.
my_data_dir = os.path.join('..', 'data')
# Create the target directory here too, so this cell does not rely on an
# earlier cell having made it; exist_ok=True makes this a no-op if it exists.
os.makedirs(my_data_dir, exist_ok=True)
my_data_file = os.path.join(my_data_dir, 'my_data.csv')
with open(my_data_file, 'w') as f:
    f.write('numrooms,alley,price\n')  # header row
    f.write('3,Pave,250000\n')
    f.write('4,,300000\n')
    f.write('5,,350000\n')
    f.write(',Pave,200000\n')
    f.write('4,,\n')
    f.write('2,Pave,150000\n')
    f.write(',Gravel,400000\n')
# Empty fields in the file are read back as NaN.
my_data = pd.read_csv(my_data_file)
my_data

Unnamed: 0,numrooms,alley,price
0,3.0,Pave,250000.0
1,4.0,,300000.0
2,5.0,,350000.0
3,,Pave,200000.0
4,4.0,,
5,2.0,Pave,150000.0
6,,Gravel,400000.0


In [7]:
# Exercise 1 (continued): drop the column that has the most missing values.
# Count the NaN entries of every column in one vectorized pass.
na_counts = my_data.isna().sum()
if na_counts.max() > 0:
    # idxmax returns the first label that reaches the maximum, which matches
    # a left-to-right scan using a strict ">" comparison.
    flag = na_counts.idxmax()
    my_data_new = my_data.drop(columns=flag)
else:
    # No column has a missing value: keep every column instead of failing
    # on an unset column name.
    flag = None
    my_data_new = my_data.copy()
my_data_new

Unnamed: 0,numrooms,price
0,3.0,250000.0
1,4.0,300000.0
2,5.0,350000.0
3,,200000.0
4,4.0,
5,2.0,150000.0
6,,400000.0


### 第二题

In [8]:
# Exercise 2: turn the reduced table into a float32 tensor. Missing values
# are not filled in here, so they show up as nan in the tensor.
my_data_values = my_data_new.to_numpy()
my_data_tensor = torch.tensor(my_data_values, dtype=torch.float32)
my_data_tensor

tensor([[3.0000e+00, 2.5000e+05],
        [4.0000e+00, 3.0000e+05],
        [5.0000e+00, 3.5000e+05],
        [       nan, 2.0000e+05],
        [4.0000e+00,        nan],
        [2.0000e+00, 1.5000e+05],
        [       nan, 4.0000e+05]])