# Data IO

### 초기화

In [1]:
import os
import numpy as np
import pandas as pd

### 작업 경로 변경하기

#### 절대 경로

In [2]:
# Show the current working directory (the notebook displays the returned path).
os.getcwd()

'c:\\git\\DataAnalysisProcess'

In [3]:
# Switch the working directory using an absolute path; the raw string keeps
# the Windows backslashes from being treated as escape sequences.
os.chdir(r'C:\git\DataAnalysisProcess\data')

In [4]:
# Confirm that the working directory is now the data folder.
os.getcwd()

'C:\\git\\DataAnalysisProcess\\data'

#### 상대 경로

In [5]:
# Switch directories with a path relative to the current working directory
# ('..' is the parent folder), then display where we ended up.
os.chdir('../data')
os.getcwd()

'C:\\git\\DataAnalysisProcess\\data'

### 파일 목록 출력

In [6]:
# List the entries in the current working directory.
os.listdir()

['APT_Price_2023.xlsx',
 'iris.csv',
 'iris.json',
 'iris.parquet',
 'iris.tsv',
 'iris.xlsx',
 'Stock_News_202301.xlsx']

In [7]:
# Same listing, sorted. The default string sort is case-sensitive, so names
# starting with an uppercase letter come before lowercase ones.
sorted(os.listdir())

['APT_Price_2023.xlsx',
 'Stock_News_202301.xlsx',
 'iris.csv',
 'iris.json',
 'iris.parquet',
 'iris.tsv',
 'iris.xlsx']

### Read Excel File
#### `.xls`
```python
df1 = pd.read_excel(io = 'iris.xls',  engine='xlrd')
```

#### `.xlsx`
```python
df1 = pd.read_excel(io = 'iris.xlsx',  engine='openpyxl')
```

#### `.xlsm`
```python
df1 = pd.read_excel(io = 'iris.xlsm',  engine='openpyxl')
```

In [8]:
# Load the .xlsx workbook into a DataFrame using the openpyxl engine.
df1 = pd.read_excel('iris.xlsx', engine='openpyxl')

In [9]:
# Display the loaded DataFrame.
df1

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [10]:
# Row labels of the DataFrame.
df1.index

RangeIndex(start=0, stop=150, step=1)

In [11]:
# Column labels of the DataFrame.
df1.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [12]:
# Get the underlying data as a NumPy array. to_numpy() is the accessor the
# pandas documentation recommends over the older .values attribute; because
# the columns mix floats and strings, the result is an object-dtype array.
df1.to_numpy()

array([[5.1, 3.5, 1.4, 0.2, 'setosa'],
       [4.9, 3.0, 1.4, 0.2, 'setosa'],
       [4.7, 3.2, 1.3, 0.2, 'setosa'],
       [4.6, 3.1, 1.5, 0.2, 'setosa'],
       [5.0, 3.6, 1.4, 0.2, 'setosa'],
       [5.4, 3.9, 1.7, 0.4, 'setosa'],
       [4.6, 3.4, 1.4, 0.3, 'setosa'],
       [5.0, 3.4, 1.5, 0.2, 'setosa'],
       [4.4, 2.9, 1.4, 0.2, 'setosa'],
       [4.9, 3.1, 1.5, 0.1, 'setosa'],
       [5.4, 3.7, 1.5, 0.2, 'setosa'],
       [4.8, 3.4, 1.6, 0.2, 'setosa'],
       [4.8, 3.0, 1.4, 0.1, 'setosa'],
       [4.3, 3.0, 1.1, 0.1, 'setosa'],
       [5.8, 4.0, 1.2, 0.2, 'setosa'],
       [5.7, 4.4, 1.5, 0.4, 'setosa'],
       [5.4, 3.9, 1.3, 0.4, 'setosa'],
       [5.1, 3.5, 1.4, 0.3, 'setosa'],
       [5.7, 3.8, 1.7, 0.3, 'setosa'],
       [5.1, 3.8, 1.5, 0.3, 'setosa'],
       [5.4, 3.4, 1.7, 0.2, 'setosa'],
       [5.1, 3.7, 1.5, 0.4, 'setosa'],
       [4.6, 3.6, 1.0, 0.2, 'setosa'],
       [5.1, 3.3, 1.7, 0.5, 'setosa'],
       [4.8, 3.4, 1.9, 0.2, 'setosa'],
       [5.0, 3.0, 1.6, 0.

- Jupyter에서 함수 호출의 소괄호 안에 커서를 두고 Shift + Tab 키를 동시에 누르면 해당 함수의 설명서(docstring)를 읽을 수 있음

In [13]:
# Show the first 10 rows.
df1.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [14]:
# Show the last rows; with no argument tail() returns five rows.
df1.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [15]:
# The index option defaults to True; index=False leaves the row index out of
# the written workbook.
df1.to_excel('test.xlsx', index=False)

In [16]:
# Check that test.xlsx now exists in the working directory.
os.listdir()

['APT_Price_2023.xlsx',
 'iris.csv',
 'iris.json',
 'iris.parquet',
 'iris.tsv',
 'iris.xlsx',
 'Stock_News_202301.xlsx',
 'test.xlsx']

In [17]:
# Load the comma-separated file into a DataFrame.
df2 = pd.read_csv('iris.csv')

In [18]:
# Preview the first rows to check that the CSV was parsed as expected.
df2.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [19]:
# Write the DataFrame back out as CSV without the row index.
df2.to_csv('test.csv', index=False)

In [20]:
# Check that test.csv now exists in the working directory.
os.listdir()

['APT_Price_2023.xlsx',
 'iris.csv',
 'iris.json',
 'iris.parquet',
 'iris.tsv',
 'iris.xlsx',
 'Stock_News_202301.xlsx',
 'test.csv',
 'test.xlsx']

#### Tab separated Values
- Pandas의 read_csv 함수는 `sep` 인자로 구분자를 지정하면 CSV뿐 아니라 TSV 등 구분자로 나뉜 텍스트 파일도 읽을 수 있음

In [21]:
# read_csv handles tab-separated files once the delimiter is given via sep.
df3 = pd.read_csv('iris.tsv', sep='\t')

In [22]:
# Preview the first rows to check that the TSV was parsed as expected.
df3.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [23]:
# Write a tab-separated file: to_csv with sep='\t', row index omitted.
df3.to_csv('test.tsv', index=False, sep='\t')

In [24]:
# Load the JSON file into a DataFrame.
df4 = pd.read_json('iris.json')

In [25]:
# Preview the first rows to check that the JSON was parsed as expected.
df4.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [26]:
# Write the DataFrame to a JSON file (pandas' default layout, since no orient
# argument is given).
df4.to_json('test.json')

#### parquet 파일
 - 하둡 생태계에서 널리 사용되는, 데이터를 컬럼(열) 단위로 저장하는 파일 형식

In [27]:
# Load the parquet file into a DataFrame.
df5 = pd.read_parquet('iris.parquet')

In [28]:
# Preview the first rows to check that the parquet file was read as expected.
df5.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [29]:
# Write the DataFrame to a parquet file.
# NOTE(review): the '.parquest' extension looks like a typo for '.parquet'
# (the input file is 'iris.parquet'). It is left as-is here because the
# recorded outputs in this notebook list the file under this exact name; fix
# the name and re-run the notebook together.
df5.to_parquet('test.parquest')

#### 테스트 파일 삭제
 - 리스트 컴프리헨션(list comprehension) 문법: `[원소 for 변수 in 범위 if 조건]`

In [30]:
# Capture the directory listing in a variable so it can be filtered.
files = os.listdir()

In [31]:
# Display the captured listing.
files

['APT_Price_2023.xlsx',
 'iris.csv',
 'iris.json',
 'iris.parquet',
 'iris.tsv',
 'iris.xlsx',
 'Stock_News_202301.xlsx',
 'test.csv',
 'test.json',
 'test.parquest',
 'test.tsv',
 'test.xlsx']

In [32]:
# Preview the files that are candidates for clean-up. Matching on the 'test.'
# prefix (rather than 'test' anywhere in the name) keeps unrelated files such
# as 'latest.csv' out of the list.
for file in files:
    if file.startswith('test.'):
        print(file)

test.csv
test.json
test.parquest
test.tsv
test.xlsx


In [33]:
# List comprehension that keeps only names starting with 'test.'. A plain
# substring check ('test' in file) would also select files that merely contain
# 'test' somewhere in their name, and this list is what gets deleted.
result = [file for file in files if file.startswith('test.')]
result

['test.csv', 'test.json', 'test.parquest', 'test.tsv', 'test.xlsx']

In [34]:
# Delete the collected test files. The isfile() check skips names that are
# already gone or are not regular files, so re-running this cell does not
# raise FileNotFoundError and a matching directory is never passed to remove().
for file in result:
    if os.path.isfile(file):
        os.remove(file)