## 결항 통계 : 데이터 전처리

In [1]:
import pandas as pd

# Shared column names, applied positionally to the domestic-airports table
# in a later cell.
col = [
    "공항명",
    "출/도착",
    "운항",
    "기상",
    "A/C접속",
    "A/C정비",
    "여객처리",
    "복합원인",
    "기타",
    "계",
]

# Incheon airport: the source CSV is read with the cp949 (Korean Windows)
# codec.
incheon_cancel = pd.read_csv(
    "C:/Data/airport/결항통계_인천공항.csv",
    encoding="cp949",
)
print("[인천공항 원본 데이터]")
incheon_cancel

[인천공항 원본 데이터]


Unnamed: 0,기상,A/C접속,A/C정비,여객처리 및 승무원관련,복합원인,기타,운항(편),총 지연편수
0,217,134,222,13,1,1662,1432905,2249


In [2]:
# Rename the Incheon columns positionally to the short names used for the
# other airports. The assignment relies on the CSV's column order.
incheon_cancel.columns = [
    "기상",
    "A/C접속",
    "A/C정비",
    "여객처리",
    "복합원인",
    "기타",
    "운항",
    "계",
]

# Add an airport-name column holding the constant value for Incheon.
incheon_cancel["공항명"] = "인천"

# Reorder the columns: airport name first, flight count second, then the
# per-cause counts, and the total last.
incheon_cancel = incheon_cancel[
    [
        "공항명",
        "운항",
        "기상",
        "A/C접속",
        "A/C정비",
        "여객처리",
        "복합원인",
        "기타",
        "계",
    ]
]

print("[인천공항 전처리 데이터]")
incheon_cancel

[인천공항 전처리 데이터]


Unnamed: 0,공항명,운항,기상,A/C접속,A/C정비,여객처리,복합원인,기타,계
0,인천,1432905,217,134,222,13,1,1662,2249


In [3]:
# Domestic airports other than Incheon: the source CSV is read with the
# cp949 (Korean Windows) codec.
airport_cancel = pd.read_csv(
    "C:/Data/airport/결항통계_국내공항.csv",
    encoding="cp949",
)

print("[국내공항(인천 제외) 원본 데이터]")
# Preview only the first ten rows of the raw table.
airport_cancel.head(n=10)

[국내공항(인천 제외) 원본 데이터]


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,결항 통계,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,,,,,,,,,
1,검색 기간,2017.01~2021.12,,,,,,,,
2,노선 구분,전체,,,,,,,,
3,,,,,,,,,,
4,공항명,출/도착,운항(편),기상,A/C접속,A/C장비,여객처리,복합원인,기타,계
5,김포,출발,339517,2220,158,114,7,0,41,2540
6,김포,도착,339927,1990,534,104,10,0,220,2858
7,김포,계,679444,4210,692,218,17,0,261,5398
8,김해,출발,220121,1272,156,37,6,0,168,1639
9,김해,도착,220286,1440,121,59,3,1,178,1802


In [4]:
# Drop the leading rows labelled 0-4: blank lines, the search-period and
# route-type preamble, and the header row embedded in the data.
airport_cancel = airport_cancel.drop(index=[0, 1, 2, 3, 4])

# Apply the shared column names positionally.
airport_cancel.columns = col

# Keep only rows that have at least 9 non-missing values.
airport_cancel = airport_cancel.dropna(thresh=9)

# Keep only the per-airport total rows, i.e. those whose
# departure/arrival column is '계'.
mask = airport_cancel["출/도착"] == "계"
airport_cancel = airport_cancel.loc[mask]

# The departure/arrival column is constant after filtering, so remove it.
airport_cancel = airport_cancel.drop(columns=["출/도착"])

print("[국내공항(인천 제외) 전처리 데이터]")
airport_cancel

[국내공항(인천 제외) 전처리 데이터]


Unnamed: 0,공항명,운항,기상,A/C접속,A/C정비,여객처리,복합원인,기타,계
7,김포,679444,4210,692,218,17,0,261,5398
10,김해,440407,2712,277,96,9,1,346,3441
13,제주,809463,5028,636,198,5,1,353,6221
16,대구,107511,481,28,14,0,0,153,676
19,광주,67860,532,60,9,2,0,55,658
22,무안,13567,37,4,1,0,0,15,57
25,청주,81206,404,15,19,0,0,69,507
28,양양,5875,70,5,0,0,0,0,75
31,여수,29030,432,20,1,0,0,1,454
34,울산,31509,629,123,6,2,2,2,764


In [5]:
# Stack the Incheon table on top of the other domestic airports, then
# discard the old row labels so the combined table is numbered from 0.
airports_cancel = pd.concat([incheon_cancel, airport_cancel]).reset_index(drop=True)
print("[국내공항 데이터]")
airports_cancel

[국내공항 데이터]


Unnamed: 0,공항명,운항,기상,A/C접속,A/C정비,여객처리,복합원인,기타,계
0,인천,1432905,217,134,222,13,1,1662,2249
1,김포,679444,4210,692,218,17,0,261,5398
2,김해,440407,2712,277,96,9,1,346,3441
3,제주,809463,5028,636,198,5,1,353,6221
4,대구,107511,481,28,14,0,0,153,676
5,광주,67860,532,60,9,2,0,55,658
6,무안,13567,37,4,1,0,0,15,57
7,청주,81206,404,15,19,0,0,69,507
8,양양,5875,70,5,0,0,0,0,75
9,여수,29030,432,20,1,0,0,1,454


In [7]:
# Cast every count column to integer.
airports_cancel = airports_cancel.astype(
    {
        name: "int"
        for name in (
            "운항",
            "기상",
            "A/C접속",
            "A/C정비",
            "여객처리",
            "복합원인",
            "기타",
            "계",
        )
    }
)

# Derived summary variable: cancellation rate, computed as the total column
# divided by the flight-count column and expressed as a percentage.
airports_cancel["결항률"] = (airports_cancel["계"] / airports_cancel["운항"]) * 100
airports_cancel

Unnamed: 0,공항명,운항,기상,A/C접속,A/C정비,여객처리,복합원인,기타,계,결항율,결항률
0,인천,1432905,217,134,222,13,1,1662,2249,0.156954,0.156954
1,김포,679444,4210,692,218,17,0,261,5398,0.794473,0.794473
2,김해,440407,2712,277,96,9,1,346,3441,0.781323,0.781323
3,제주,809463,5028,636,198,5,1,353,6221,0.768534,0.768534
4,대구,107511,481,28,14,0,0,153,676,0.628773,0.628773
5,광주,67860,532,60,9,2,0,55,658,0.969643,0.969643
6,무안,13567,37,4,1,0,0,15,57,0.420137,0.420137
7,청주,81206,404,15,19,0,0,69,507,0.624338,0.624338
8,양양,5875,70,5,0,0,0,0,75,1.276596,1.276596
9,여수,29030,432,20,1,0,0,1,454,1.563899,1.563899


In [8]:
# Persist the merged table with pandas' default CSV settings.
# NOTE(review): by default the row index is written as an unnamed first
# column; confirm downstream readers expect it before changing that.
airports_cancel.to_csv(path_or_buf="C:/Data/airport/결항통계.csv")