# 数据预处理 

## 读取数据 

In [1]:
import os

# Make sure the ../data directory exists before anything is written into it.
path = os.path.join('..', 'data')
os.makedirs(path, exist_ok=True)

# Location of the toy dataset; the file is (re)created by the write below.
data_file = os.path.join(path, 'house_tiny.csv')

# One header row followed by four samples.
with open(data_file, 'w') as f:
    f.writelines([
        'NumRooms,Alley,Price\n',
        'NA,Pave,127500\n',
        '2,NA,106000\n',
        '4,NA,178100\n',
        'NA,NA,140000\n',
    ])

In [2]:
import pandas as pd

# Parse the CSV at `data_file` into a DataFrame and display it.
data = pd.read_csv(filepath_or_buffer=data_file)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


## 处理缺失值

In [3]:
# Split by position: the first two columns are the features, the third is the target.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Replace missing numeric entries with their column mean.
# numeric_only=True is needed because `Alley` holds strings: pandas >= 2.0
# raises TypeError when asked to average a non-numeric column, whereas older
# versions silently skipped it. String columns are left untouched here.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [4]:
# One-hot encode the string columns; dummy_na=True adds an indicator column
# for missing values as well.
# dtype=int keeps the indicators as 0/1 integers. Without it pandas >= 2.0
# produces bool columns, which no longer match the displayed output and turn
# `inputs.values` into an object array that torch cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


## 转换为张量 

In [5]:
import torch

# Convert features and target to tensors.
# The features are cast to float explicitly: if the frame mixes float and
# bool/int indicator columns, `.values` may yield an object-dtype array,
# which torch.tensor rejects.
X, y = torch.tensor(inputs.to_numpy(dtype=float)), torch.tensor(outputs.to_numpy())
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

## 练习 

In [6]:
# Exercise dataset: a wider table with an extra column, `Hello`, that is NA in every row.
data_file = os.path.join('..', 'data', 'exercise_data22.csv')

# One header row followed by eight samples.
with open(data_file, 'w') as f:
    f.writelines([
        'NumRooms,NumBath,Alley,Price,Hello\n',
        'NA,2,Pave,127500,NA\n',
        '2,2,NA,106000,NA\n',
        '4,NA,NA,178100,NA\n',
        'NA,2,NA,140000,NA\n',
        '4,2,Pave,127500,NA\n',
        '2,NA,Pave,106000,NA\n',
        '4,NA,NA,178100,NA\n',
        '3,2,NA,140000,NA\n',
    ])

In [7]:
# Parse the exercise CSV at `data_file` into a DataFrame and display it.
data2 = pd.read_csv(filepath_or_buffer=data_file)
data2

Unnamed: 0,NumRooms,NumBath,Alley,Price,Hello
0,,2.0,Pave,127500,
1,2.0,2.0,,106000,
2,4.0,,,178100,
3,,2.0,,140000,
4,4.0,2.0,Pave,127500,
5,2.0,,Pave,106000,
6,4.0,,,178100,
7,3.0,2.0,,140000,


In [8]:
# Number of missing entries in each column (isna is the alias of isnull;
# the default sum runs down the rows, giving one count per column).
numna = data2.isna().sum()
numna

NumRooms    2
NumBath     3
Alley       5
Price       0
Hello       8
dtype: int64

In [9]:
# Label of the column with the highest missing-value count.
maxid = numna.idxmax(axis=0)
maxid

'Hello'

In [10]:
# Remove the column identified by `maxid` instead of a hard-coded name, so
# the cell keeps deleting the column with the most missing values if the
# dataset changes.
del data2[maxid]
data2

Unnamed: 0,NumRooms,NumBath,Alley,Price
0,,2.0,Pave,127500
1,2.0,2.0,,106000
2,4.0,,,178100
3,,2.0,,140000
4,4.0,2.0,Pave,127500
5,2.0,,Pave,106000
6,4.0,,,178100
7,3.0,2.0,,140000


In [11]:
# Split by position: the first three columns are the features, the fourth is the target.
inputs = data2.iloc[:, :3]
outputs = data2.iloc[:, 3]
inputs, outputs

(   NumRooms  NumBath Alley
 0       NaN      2.0  Pave
 1       2.0      2.0   NaN
 2       4.0      NaN   NaN
 3       NaN      2.0   NaN
 4       4.0      2.0  Pave
 5       2.0      NaN  Pave
 6       4.0      NaN   NaN
 7       3.0      2.0   NaN,
 0    127500
 1    106000
 2    178100
 3    140000
 4    127500
 5    106000
 6    178100
 7    140000
 Name: Price, dtype: int64)

In [12]:
# Replace missing numeric entries with their column mean.
# numeric_only=True is needed because `Alley` holds strings: pandas >= 2.0
# raises TypeError when asked to average a non-numeric column, whereas older
# versions silently skipped it. String columns are left untouched here.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

Unnamed: 0,NumRooms,NumBath,Alley
0,3.166667,2.0,Pave
1,2.0,2.0,
2,4.0,2.0,
3,3.166667,2.0,
4,4.0,2.0,Pave
5,2.0,2.0,Pave
6,4.0,2.0,
7,3.0,2.0,


In [13]:
# One-hot encode the string columns; dummy_na=True adds an indicator column
# for missing values as well.
# dtype=int keeps the indicators as 0/1 integers. Without it pandas >= 2.0
# produces bool columns, which no longer match the displayed output and turn
# `inputs.values` into an object array that torch cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
inputs

Unnamed: 0,NumRooms,NumBath,Alley_Pave,Alley_nan
0,3.166667,2.0,1,0
1,2.0,2.0,0,1
2,4.0,2.0,0,1
3,3.166667,2.0,0,1
4,4.0,2.0,1,0
5,2.0,2.0,1,0
6,4.0,2.0,0,1
7,3.0,2.0,0,1


In [14]:
# Convert features and target to tensors.
# The features are cast to float explicitly: if the frame mixes float and
# bool/int indicator columns, `.values` may yield an object-dtype array,
# which torch.tensor rejects.
X2, y2 = torch.tensor(inputs.to_numpy(dtype=float)), torch.tensor(outputs.to_numpy())
X2, y2

(tensor([[3.1667, 2.0000, 1.0000, 0.0000],
         [2.0000, 2.0000, 0.0000, 1.0000],
         [4.0000, 2.0000, 0.0000, 1.0000],
         [3.1667, 2.0000, 0.0000, 1.0000],
         [4.0000, 2.0000, 1.0000, 0.0000],
         [2.0000, 2.0000, 1.0000, 0.0000],
         [4.0000, 2.0000, 0.0000, 1.0000],
         [3.0000, 2.0000, 0.0000, 1.0000]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000, 127500, 106000, 178100, 140000]))